From d0c454c23637dceda6d7395dd2141b564e3efa47 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 28 Aug 2025 13:53:14 -0400 Subject: [PATCH] Add ostree-shutdown.service: hide /sysroot and make /etc read-only We have a lot of bind mounts; these are usually set up in the initramfs. So far during shutdown we've let systemd just try to sort things out via auto-generated mount units i.e. `sysroot.mount` and `etc.mount` and so on. systemd has some special casing for `-.mount` (i.e. `/`) and `etc.mount` https://github.com/systemd/systemd/blob/e91bfad241799b449df73efc30d833b9c5937001/src/shared/fstab-util.c#L72 However it doesn't special case `/sysroot` - which is currently an ostree-specific invention (when used in the real root). We cannot actually unmount `/sysroot` while it's in use, and it is because `/etc` is a bind mount into it. And we can't tear down `/etc` because it's just expected that e.g. pid 1 and other things hold open references to it - until things finally transition into systemd-shutdown. What we can do though is explicitly detach it during the shutdown phase; this ensures that systemd won't try to clean it up then, suppressing errors about its inability to do so. While we're here, let's also remount `/etc` read-only; while systemd itself will try to do so during systemd-shutdown. Per comments if this service fails, it's a bug in something else to be fixed. Closes: https://github.com/ostreedev/ostree/issues/3513 Signed-off-by: Colin Walters --- Makefile-boot.am | 2 + src/boot/ostree-shutdown.service | 22 ++++++++ src/libostree/ostree-impl-system-generator.c | 4 ++ src/switchroot/ostree-remount.c | 57 ++++++++++++++++++++ 4 files changed, 85 insertions(+) create mode 100644 src/boot/ostree-shutdown.service diff --git a/Makefile-boot.am b/Makefile-boot.am index 7b7cb892..e26c1816 100644 --- a/Makefile-boot.am +++ b/Makefile-boot.am @@ -38,6 +38,7 @@ endif if BUILDOPT_SYSTEMD systemdsystemunit_DATA = src/boot/ostree-prepare-root.service \ src/boot/ostree-remount.service \ + src/boot/ostree-shutdown.service \ src/boot/ostree-boot-complete.service \ src/boot/ostree-finalize-staged.service \ src/boot/ostree-finalize-staged-hold.service \ @@ -69,6 +70,7 @@ EXTRA_DIST += src/boot/dracut/module-setup.sh \ src/boot/ostree-boot-complete.service \ src/boot/ostree-prepare-root.service \ src/boot/ostree-remount.service \ + src/boot/ostree-shutdown.service \ src/boot/ostree-finalize-staged.service \ src/boot/ostree-finalize-staged-hold.service \ src/boot/ostree-state-overlay@.service \ diff --git a/src/boot/ostree-shutdown.service b/src/boot/ostree-shutdown.service new file mode 100644 index 00000000..24919ec5 --- /dev/null +++ b/src/boot/ostree-shutdown.service @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: LGPL-2.0+ + +[Unit] +Description=OSTree Shutdown +Documentation=man:ostree(1) +DefaultDependencies=no +# Only enabled via generator, but for good measure +ConditionKernelCommandLine=ostree +# Run after core mounts +RequiresMountsFor=/etc /sysroot +# However, we want to only shut down after `/var` has been umounted. +# Since this runs via ExecStop, this Before= is actually After= at shutdown +Before=var.mount +Conflicts=umount.target +Before=umount.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStop=/usr/lib/ostree/ostree-remount --shutdown + +# No [Install] section - we're only enabled via generator diff --git a/src/libostree/ostree-impl-system-generator.c b/src/libostree/ostree-impl-system-generator.c index f0116251..65f07fd5 100644 --- a/src/libostree/ostree-impl-system-generator.c +++ b/src/libostree/ostree-impl-system-generator.c @@ -108,6 +108,10 @@ require_internal_units (const char *normal_dir, const char *early_dir, const cha "local-fs.target.requires/ostree-remount.service") < 0) return glnx_throw_errno_prefix (error, "symlinkat"); + if (symlinkat (SYSTEM_DATA_UNIT_PATH "/ostree-shutdown.service", normal_dir_dfd, + "local-fs.target.wants/ostree-shutdown.service") + < 0) + return glnx_throw_errno_prefix (error, "symlinkat"); if (!glnx_shutil_mkdir_p_at (normal_dir_dfd, "multi-user.target.wants", 0755, cancellable, error)) return FALSE; diff --git a/src/switchroot/ostree-remount.c b/src/switchroot/ostree-remount.c index f0a4b3d9..b1c829ea 100644 --- a/src/switchroot/ostree-remount.c +++ b/src/switchroot/ostree-remount.c @@ -42,6 +42,12 @@ #include "ostree-mount-util.h" #include "otcore.h" +static gboolean opt_shutdown; + +static GOptionEntry options[] = { { "shutdown", 'S', 0, G_OPTION_ARG_NONE, &opt_shutdown, + "Perform shutdown unmounting", NULL }, + { NULL } }; + static void do_remount (const char *target, bool writable) { @@ -133,10 +139,61 @@ relabel_dir_for_upper (const char *upper_path, const char *real_path, gboolean i #endif } +// ostree-prepare-root sets things up so that /sysroot points to the "physical" (real) root in the +// initramfs, and then with composefs `/` is an overlay+EROFS that holds references to content in +// that physical filesystem. +// +// In a typical mutable system where the OS is in a mutable `/` (or `/usr), systemd explicitly +// skips unmounting both `/` and `/usr`. It will remount them read-only though - and that's +// the semantic we want to match here. +static void +do_shutdown (void) +{ + const char *sysroot = "/sysroot"; + if (mount (sysroot, sysroot, NULL, MS_REMOUNT | MS_SILENT | MS_RDONLY, NULL) < 0) + { + // Hopefully at this point nothing has any write references, but if they + // do we still want to continue. + perror ("Remounting /sysroot read-only"); + } + // And fully detach it from the mountns because otherwise systemd thinks + // it can be unmounted, but it can't - it's required by `/` (and in a + // composefs setup `/etc`) and possibly `/var`. Again, we only really + // care that it got mounted read-only and hence outstanding data flushed. + // A better fix in the future would be to teach systemd to honor `-.mount` + // having a `Requires=sysroot.mount` meaning we can't unmount the latter. + if (umount2 (sysroot, MNT_DETACH) < 0) + err (EXIT_FAILURE, "umount(/sysroot)"); + + // And finally: /etc + // NOTE! This one is intentionally last in that we want to try to make + // this read-only, but if it fails, systemd-shutdown will have another + // attempt after a process killing spree. If anything happens to be + // holding a writable fd at this point, conceptually it would have + // created race conditions vs ostree-finalize-staged.service, and so + // having this service fail will be a signal that those things need + // to be fixed. + do_remount ("/etc", false); + // Don't add anything else after this. +} + int main (int argc, char *argv[]) { g_autoptr (GError) error = NULL; + g_autoptr (GOptionContext) context = g_option_context_new (""); + g_option_context_add_main_entries (context, options, NULL); + if (!g_option_context_parse (context, &argc, &argv, &error)) + errx (EXIT_FAILURE, "Error parsing options: %s", error->message); + + // Handle the shutdown option + if (opt_shutdown) + { + do_shutdown (); + return 0; + } + // Otherwise fall through to the default startup + g_autoptr (GVariant) ostree_run_metadata_v = NULL; { glnx_autofd int fd = open (OTCORE_RUN_BOOTED, O_RDONLY | O_CLOEXEC); -- 2.30.2